InĀ [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")
InĀ [2]:
# Read train_values.csv into `df`. The path is relative, so the notebook
# must be run from the directory that contains the CSV files.
df=pd.read_csv('train_values.csv')
InĀ [3]:
# Read train_labels.csv into `df_labels` (relative path, same directory as
# train_values.csv); it is joined onto `df` by building_id in the next cell.
df_labels=pd.read_csv('train_labels.csv')
InĀ [4]:
# Join the labels frame onto the values frame by building_id.
# validate='one_to_one' makes pandas raise MergeError if building_id is
# duplicated on either side, instead of silently multiplying rows.
df = pd.merge(df, df_labels, on='building_id', validate='one_to_one')
InĀ [5]:
# Preview the first rows of the merged frame.
df.head()
Out[5]:
building_id geo_level_1_id geo_level_2_id geo_level_3_id count_floors_pre_eq age area_percentage height_percentage land_surface_condition foundation_type ... has_secondary_use_hotel has_secondary_use_rental has_secondary_use_institution has_secondary_use_school has_secondary_use_industry has_secondary_use_health_post has_secondary_use_gov_office has_secondary_use_use_police has_secondary_use_other damage_grade
0 802906 6 487 12198 2 30 6 5 t r ... 0 0 0 0 0 0 0 0 0 3
1 28830 8 900 2812 2 10 8 7 o r ... 0 0 0 0 0 0 0 0 0 2
2 94947 21 363 8973 2 10 5 5 t r ... 0 0 0 0 0 0 0 0 0 3
3 590882 22 418 10694 2 10 6 5 t r ... 0 0 0 0 0 0 0 0 0 2
4 201944 11 131 1488 3 30 8 9 t r ... 0 0 0 0 0 0 0 0 0 3

5 rows × 40 columns

InĀ [6]:
# Column dtypes and non-null counts of the merged frame.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 40 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             260601 non-null  int64 
 1   geo_level_1_id                          260601 non-null  int64 
 2   geo_level_2_id                          260601 non-null  int64 
 3   geo_level_3_id                          260601 non-null  int64 
 4   count_floors_pre_eq                     260601 non-null  int64 
 5   age                                     260601 non-null  int64 
 6   area_percentage                         260601 non-null  int64 
 7   height_percentage                       260601 non-null  int64 
 8   land_surface_condition                  260601 non-null  object
 9   foundation_type                         260601 non-null  object
 10  roof_type                               260601 non-null  object
 11  ground_floor_type                       260601 non-null  object
 12  other_floor_type                        260601 non-null  object
 13  position                                260601 non-null  object
 14  plan_configuration                      260601 non-null  object
 15  has_superstructure_adobe_mud            260601 non-null  int64 
 16  has_superstructure_mud_mortar_stone     260601 non-null  int64 
 17  has_superstructure_stone_flag           260601 non-null  int64 
 18  has_superstructure_cement_mortar_stone  260601 non-null  int64 
 19  has_superstructure_mud_mortar_brick     260601 non-null  int64 
 20  has_superstructure_cement_mortar_brick  260601 non-null  int64 
 21  has_superstructure_timber               260601 non-null  int64 
 22  has_superstructure_bamboo               260601 non-null  int64 
 23  has_superstructure_rc_non_engineered    260601 non-null  int64 
 24  has_superstructure_rc_engineered        260601 non-null  int64 
 25  has_superstructure_other                260601 non-null  int64 
 26  legal_ownership_status                  260601 non-null  object
 27  count_families                          260601 non-null  int64 
 28  has_secondary_use                       260601 non-null  int64 
 29  has_secondary_use_agriculture           260601 non-null  int64 
 30  has_secondary_use_hotel                 260601 non-null  int64 
 31  has_secondary_use_rental                260601 non-null  int64 
 32  has_secondary_use_institution           260601 non-null  int64 
 33  has_secondary_use_school                260601 non-null  int64 
 34  has_secondary_use_industry              260601 non-null  int64 
 35  has_secondary_use_health_post           260601 non-null  int64 
 36  has_secondary_use_gov_office            260601 non-null  int64 
 37  has_secondary_use_use_police            260601 non-null  int64 
 38  has_secondary_use_other                 260601 non-null  int64 
 39  damage_grade                            260601 non-null  int64 
dtypes: int64(32), object(8)
memory usage: 79.5+ MB
InĀ [7]:
# Summary statistics; by default describe() covers only the numeric columns,
# so the object-typed columns are not included.
df.describe()
Out[7]:
building_id geo_level_1_id geo_level_2_id geo_level_3_id count_floors_pre_eq age area_percentage height_percentage has_superstructure_adobe_mud has_superstructure_mud_mortar_stone ... has_secondary_use_hotel has_secondary_use_rental has_secondary_use_institution has_secondary_use_school has_secondary_use_industry has_secondary_use_health_post has_secondary_use_gov_office has_secondary_use_use_police has_secondary_use_other damage_grade
count 2.606010e+05 260601.000000 260601.000000 260601.000000 260601.000000 260601.000000 260601.000000 260601.000000 260601.000000 260601.000000 ... 260601.000000 260601.000000 260601.000000 260601.000000 260601.000000 260601.000000 260601.000000 260601.000000 260601.000000 260601.000000
mean 5.256755e+05 13.900353 701.074685 6257.876148 2.129723 26.535029 8.018051 5.434365 0.088645 0.761935 ... 0.033626 0.008101 0.000940 0.000361 0.001071 0.000188 0.000146 0.000088 0.005119 2.238272
std 3.045450e+05 8.033617 412.710734 3646.369645 0.727665 73.565937 4.392231 1.918418 0.284231 0.425900 ... 0.180265 0.089638 0.030647 0.018989 0.032703 0.013711 0.012075 0.009394 0.071364 0.611814
min 4.000000e+00 0.000000 0.000000 0.000000 1.000000 0.000000 1.000000 2.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
25% 2.611900e+05 7.000000 350.000000 3073.000000 2.000000 10.000000 5.000000 4.000000 0.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000
50% 5.257570e+05 12.000000 702.000000 6270.000000 2.000000 15.000000 7.000000 5.000000 0.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000
75% 7.897620e+05 21.000000 1050.000000 9412.000000 2.000000 30.000000 9.000000 6.000000 0.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.000000
max 1.052934e+06 30.000000 1427.000000 12567.000000 9.000000 995.000000 100.000000 32.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 3.000000

8 rows × 32 columns

InĀ [8]:
# building_id has served its purpose as the merge key, so remove it.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='building_id', errors='ignore')
InĀ [9]:
# Preview again to confirm building_id is no longer a column.
df.head()
Out[9]:
geo_level_1_id geo_level_2_id geo_level_3_id count_floors_pre_eq age area_percentage height_percentage land_surface_condition foundation_type roof_type ... has_secondary_use_hotel has_secondary_use_rental has_secondary_use_institution has_secondary_use_school has_secondary_use_industry has_secondary_use_health_post has_secondary_use_gov_office has_secondary_use_use_police has_secondary_use_other damage_grade
0 6 487 12198 2 30 6 5 t r n ... 0 0 0 0 0 0 0 0 0 3
1 8 900 2812 2 10 8 7 o r n ... 0 0 0 0 0 0 0 0 0 2
2 21 363 8973 2 10 5 5 t r n ... 0 0 0 0 0 0 0 0 0 3
3 22 418 10694 2 10 6 5 t r n ... 0 0 0 0 0 0 0 0 0 2
4 11 131 1488 3 30 8 9 t r n ... 0 0 0 0 0 0 0 0 0 3

5 rows × 39 columns

InĀ [10]:
# Distinct values taken by `age`, to inspect its range.
df['age'].unique()
Out[10]:
array([ 30,  10,  25,   0,  15,  20,  45,  55,   5,  40,  80,  60,  35,
        70,  50,  65, 100,  75,  85, 190, 995, 105,  90, 120,  95, 110,
       115, 150, 200, 130, 125, 140, 155, 160, 175, 135, 145, 195, 180,
       165, 170, 185], dtype=int64)
InĀ [11]:
# Number of rows per `age` value, most frequent first.
df['age'].value_counts()
Out[11]:
age
10     38896
15     36010
5      33697
20     32182
0      26041
25     24366
30     18028
35     10710
40     10559
50      7257
45      4711
60      3612
80      3055
55      2033
70      1975
995     1390
100     1364
65      1123
90      1085
85       847
75       512
95       414
120      180
150      142
200      106
110      100
105       89
125       37
115       21
130        9
140        9
180        7
160        6
170        6
175        5
135        5
190        3
145        3
195        2
165        2
155        1
185        1
Name: count, dtype: int64
InĀ [12]:
# 995 sits far outside the otherwise contiguous 0-200 range of `age`
# (presumably a placeholder/cap rather than a real age -- TODO confirm),
# so those rows are excluded from the analysis.
AGE_SENTINEL = 995

# A boolean mask is used instead of dropping by index label, so the filter
# removes exactly the matching rows even if the index is not unique.
# .copy() keeps `df` an independent frame for any later column assignments.
df = df.loc[df['age'] != AGE_SENTINEL].copy()
InĀ [13]:
# Re-check the distinct ages after the age == 995 rows were removed.
df['age'].unique()
Out[13]:
array([ 30,  10,  25,   0,  15,  20,  45,  55,   5,  40,  80,  60,  35,
        70,  50,  65, 100,  75,  85, 190, 105,  90, 120,  95, 110, 115,
       150, 200, 130, 125, 140, 155, 160, 175, 135, 145, 195, 180, 165,
       170, 185], dtype=int64)
InĀ [14]:
# One box per distinct `age` value, showing the distribution of
# count_floors_pre_eq within that age group.
sns.boxplot(data=df, x='age', y='count_floors_pre_eq')
Out[14]:
<Axes: xlabel='age', ylabel='count_floors_pre_eq'>
No description has been provided for this image
InĀ [15]:
# Raw scatter of count_floors_pre_eq against age. An explicit Axes is used so
# the figure carries axis labels and a title and can be read on its own;
# plt.show() replaces the bare PathCollection repr the cell used to echo.
fig, ax = plt.subplots()
ax.scatter(df['age'], df['count_floors_pre_eq'])
ax.set(
    xlabel='age',
    ylabel='count_floors_pre_eq',
    title='count_floors_pre_eq vs. age',
)
plt.show()
Out[15]:
<matplotlib.collections.PathCollection at 0x209c1bbc4d0>
No description has been provided for this image
InĀ [16]:
# Same age / floor-count scatter, with marker size encoding damage_grade
# (marker sizes are scaled between 15 and 200).
sns.relplot(
    data=df,
    x="age",
    y="count_floors_pre_eq",
    size="damage_grade",
    sizes=(15, 200),
)
Out[16]:
<seaborn.axisgrid.FacetGrid at 0x209c04a6690>
No description has been provided for this image
InĀ [17]:
# TODO: unfinished line-plot experiment -- the `hue` column was never chosen
# (it is an empty string), so the call is left disabled.
#sns.relplot(data = df, kind = "line", x = "age", y = "count_floors_pre_eq", hue = "")
InĀ [18]:
# Number of rows for each value of the has_superstructure_bamboo flag.
df['has_superstructure_bamboo'].value_counts()
Out[18]:
has_superstructure_bamboo
0    237201
1     22010
Name: count, dtype: int64
InĀ [19]:
def _count_flagged(material):
    """Return how many rows of `df` have has_superstructure_<material> equal to 1."""
    return df[f"has_superstructure_{material}"].eq(1).sum()


# The mud-mortar-stone count was originally stored only under the misleading
# name `df_mud_adobe_mud`. It now has an accurate name, and the old name is
# kept as an alias because later cells still read it.
df_mud_mortar_stone = _count_flagged("mud_mortar_stone")
df_mud_adobe_mud = df_mud_mortar_stone

df_adobe_mud = _count_flagged("adobe_mud")
df_stone_flag = _count_flagged("stone_flag")
df_cement_mortar_stone = _count_flagged("cement_mortar_stone")
df_mud_mortar_brick = _count_flagged("mud_mortar_brick")
df_cement_mortar_brick = _count_flagged("cement_mortar_brick")
df_timber = _count_flagged("timber")
df_bamboo = _count_flagged("bamboo")
df_rc_non_engineered = _count_flagged("rc_non_engineered")
df_rc_engineered = _count_flagged("rc_engineered")
df_other = _count_flagged("other")
InĀ [20]:
#df_adobe_mud, df_bamboo, df_mud_mortar_stone, df_stone_flag, df_cement_mortar_stone, df_cement_mortar_brick, df_timber, df_rc_non_engineered, df_rc_engineered, df_other
InĀ [21]:
# Despite its name, this is the has_superstructure_mud_mortar_stone count
# (see the cell where it is assigned).
df_mud_adobe_mud
Out[21]:
197524
InĀ [22]:
# Number of rows with has_superstructure_bamboo == 1.
df_bamboo
Out[22]:
22010
InĀ [23]:
# Bar label -> number of buildings with that superstructure flag set.
# Note: `df_mud_adobe_mud` holds the has_superstructure_mud_mortar_stone
# count despite its name, so it is correctly paired with 'Mud Mortar Stone'.
superstructure_counts = {
    'Mud Mortar Stone': df_mud_adobe_mud,
    'Adobe Mud': df_adobe_mud,
    'Stone Flag': df_stone_flag,
    'Cement Mortar Stone': df_cement_mortar_stone,
    'Mud Mortar Brick': df_mud_mortar_brick,
    'Cement Mortar Brick': df_cement_mortar_brick,
    'Timber': df_timber,
    'Bamboo': df_bamboo,
    'RC Non-Engineered': df_rc_non_engineered,
    'RC Engineered': df_rc_engineered,
    'Other': df_other
}

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    list(superstructure_counts.keys()),
    list(superstructure_counts.values()),
    color='skyblue',
)
ax.set_title('Counts of Buildings by Superstructure Type')
ax.set_xlabel('Superstructure Type')
ax.set_ylabel('Number of Buildings')
ax.tick_params(axis='x', labelrotation=45)
# Pass True explicitly: grid(axis='y') with no visibility argument *toggles*
# the y grid, which switches it OFF under the darkgrid theme set at the top
# of the notebook (where the grid is already on).
ax.grid(True, axis='y')
fig.tight_layout()
# Saved to the current working directory before the figure is shown.
fig.savefig("building.png")
plt.show()
No description has been provided for this image
InĀ [24]:
# Mean damage_grade for every (age, count_floors_pre_eq) combination.
data = df[['age', 'count_floors_pre_eq', 'damage_grade']]
grouped_data = data.groupby(['age', 'count_floors_pre_eq'], as_index=False).mean()

ages = grouped_data['age']
floors = grouped_data['count_floors_pre_eq']
damage_grade = grouped_data['damage_grade']

# Bubble chart: marker size and colour both encode the group's mean damage grade.
fig, ax = plt.subplots(figsize=(10, 6))
bubbles = ax.scatter(
    ages,
    floors,
    s=damage_grade * 50,
    c=damage_grade,
    cmap='coolwarm',
    alpha=0.8,
)
fig.colorbar(bubbles, ax=ax, label='Damage Grade')
ax.set_xlabel('Age')
ax.set_ylabel('Number of Floors')
ax.set_title('Damage Grade by Age and Number of Floors')
ax.grid(True)
plt.show()
No description has been provided for this image
InĀ [25]:
from ydata_profiling import ProfileReport
InĀ [26]:
# Configure a ydata-profiling report over the current `df`
# (report title and explorative mode are set here).
profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
InĀ [27]:
# Render the report inline as this cell's output.
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[27]:

InĀ [28]:
# Export the report as an HTML file; the relative path places it in the
# current working directory.
profile.to_file("output.html")
Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]